
// Note: this file writes its analysis data to $temp. Older versions of the analysis files exist in $data; this script does not erase them.
// Close any log that is still open (capture suppresses the error when none is),
// then start this script's log, overwriting the previous one.
capture log close
log using "$output\log files\01_data_cleaning.smcl", replace

********************************************************************************
// Mirror-trade gaps: partner-reported exports (COMTRADE) versus imports reported
// in the customs data. The section is executed only when the switch is set to 1.
local run = 0
if `run' == 1 {
	// Customs import totals at HS6 (HS codes already combined)
	use  "$data\EDD importer concentration measures 2024 March\Indicators based on all observations\totalimportsHS6.dta", clear

	rename (v h) (imp_reported_customs h6)

	// Attach the partner-reported exports; rows present only in the using file are discarded
	merge 1:1 c h6 y using "$data\partner_import_h1.dta", keep(master match) nogenerate

	// System-missing reported exports are stored as 0
	replace exp_reported = 0 if exp_reported == .

	// Log gap and normalised gap between reported exports and customs imports
	generate gap_evasion_imp_customs_log = log(exp_reported) - log(imp_reported_customs)
	generate gap_evasion_imp_customs = (exp_reported - imp_reported_customs)/(imp_reported_customs + exp_reported)

	// The gaps are set to missing wherever reported exports are 0
	foreach gap of varlist gap_evasion_imp* {
		replace `gap' = . if exp_reported == 0
	}

	save  "$temp\mirror_h6_h1.dta", replace
}


********************************************************************************
// Assemble the country (c) x year (y) x HS6 (h6) panel. Every merge in this
// part keeps only master and matched rows. Runs only when the switch is 1.
local run = 1
if `run' == 1 {
// Base data: HS6 importer concentration measures
use "$data\EDD importer concentration measures 2024 March\Indicators based on all observations\CYH6_importerconcentration.dta", clear

// Customs import totals; that file is merged on h, so h6 is renamed temporarily
rename h6 h
merge 1:1 c y h using "$data\EDD importer concentration measures 2024 March\Indicators based on all observations\totalimportsHS6.dta", keep(master match) nogenerate
rename h h6

// Tariffs (the zero-fill of tf_* below is intentionally left disabled)
merge 1:1 c h6 y using "$temp\tariff.dta", keep(master match)
//bys c y: egen _mtag = max(_m)
//recode tf_* (. = 0) if _mtag==3
//drop _mtag
drop _merge

// Mirror-trade gap measures
merge 1:1 c h6 y using "$temp\mirror_h6_h1.dta", keep(master match) nogenerate

// Non-tariff measures, merged on h
rename h6 h
merge 1:1 y c h using "$temp\ntm_trains.dta", keep(master match)
// In country-years with at least one matched row, missing ntm_* values become 0
bysort c y: egen _mtag = max(_merge)
recode ntm_* (. = 0) if _mtag==3
drop _mtag _merge
rename h h6

// Country-year data from gini.dta
merge m:1 c y using "$data\gini.dta", keep(master match) nogenerate

// HS4 codes: the first four characters of the HS6 code
generate h4 = substr(h6, 1, 4)
// B1 from the HS6 file is renamed first so the HS4-level B1 can be merged in beside it
rename B1 B1_hs6
merge m:1 h4 y c using "$data\EDD importer concentration measures 2024 March\Indicators based on all observations\CYH4_importerconcentration.dta", keepusing(B1) keep(master match) nogenerate
rename B1 B1_hs4
sort c y

// Type of goods: typeofgoods.dta is keyed on hs4, so h4 is renamed for the merge
rename h4 hs4
merge m:1 hs4 using "$data\typeofgoods.dta", keep(master match) nogenerate
rename hs4 h4

// Exporter import share
// Rows that exist only in the using file are dropped, as for every other merge
// in this script. Without this, using-only c-y-h6 cells would be appended with
// all base measures missing, and would then enter the later country-level
// merges, the group() ids and the medians used for centering.
merge 1:1 c y h6 using "$data\EDD importer concentration measures 2024 March\Indicators based on all observations\CYH6_commoditiesexporterimportshare.dta"
drop if _merge==2
drop _merge

// Commodity export shares, by country-year
merge m:1 c y using "$data\wdi_v2.dta", keep(master match) nogenerate

// Commodity price: the using file is merged on iso3code/year, so the keys are
// renamed for the merge and restored afterwards
rename (y c) (year iso3code)
merge m:1 iso3code year using "$data\commoshockfixedw", keep(master match) nogenerate
rename commshockfixw compriceindexfixw
rename (year iso3code) (y c)

// World governance indicators: re-key the source file to c/y in a tempfile,
// then merge it by country-year
tempfile wgi
preserve
use "$data\wgidataset.dta", clear
rename (code year) (c y)
sort c y
save `wgi'
restore

sort c y
merge m:1 c y using `wgi', keep(master match) nogenerate

// Continent dummies: only isocode and cont_* are taken from rugged_data.dta,
// and they are merged by country
tempfile cont
preserve
use "$data\rugged_data.dta", clear
keep isocode cont_*
rename isocode c
save `cont'
restore

sort c
merge m:1 c using `cont', keep(master match) nogenerate

// Data transformations
// Multiply both B1 measures by 10000; B1 is a copy of the HS6-level one
foreach hhi of varlist B1_hs4 B1_hs6 {
	replace `hhi' = `hhi'*10000
}
generate B1 = B1_hs6

// Logs of G1, G2 and A1
foreach stem in G1 G2 A1 {
	generate ln_`stem' = ln(`stem')
}

// Divide the export shares and the commodity price index by 100
foreach pct of varlist commodity_exports ores_metal_exports food_exports fuel_exports compriceindexfixw {
	replace `pct' = `pct'/100
}

// Fixed-effect group identifiers
foreach prod in h4 h6 {
	egen y`prod' = group(`prod' y)
}
egen cy = group(c y)
foreach prod in h6 h4 {
	egen c`prod' = group(c `prod')
}




// Variable labels
label variable B1 "HHI"
label variable B1_hs4 "HHI"
label variable B1_hs6 "HHI"
label variable loggdppc "Log GDP per capita" //PPP
label variable loggdppc2 "Log GDP per capita"
label variable commodity_exports "Commodity export share [0,1]"
label variable ores_metal_exports "Ores and metals export share [0,1]"
label variable food_exports "Food export share [0,1]"
label variable fuel_exports "Fuel export share [0,1]"
label variable compriceindexfixw "Export commodity price index (1=2012)"
label variable cce "Control of corruption"
label variable rqe "Regulatory quality"

label variable ln_G1 "Log number of sources countries"
label variable ln_G2 "Log unit value"
label variable C1 "Firm entry rate"
label variable C2 "Firm exit rate"
label variable ln_A1 "Log number of importers"
label variable B7i "Share of the largest importer"
label variable B7iinew "Share of the top 4 largest importers"
label variable gini_i "Gini coefficient [0, 1]"
label variable gini "Gini coefficient [0, 1]"

/* Median-centering (not a z-score: the median is subtracted and nothing is scaled).
   For each covariate the uncentered values are kept in <var>_orig and the
   variable itself is replaced by its deviation from the sample median. */
tempname med
foreach var of varlist commodity_exports fuel_exports ores_metal_exports food_exports compriceindexfixw gini_i loggdppc{
		sum `var',d
		// Capture the median right away so no later command can overwrite r()
		scalar `med' = r(p50)
		// clonevar keeps the storage type and variable label of the source;
		// a plain gen would create a float and round a double source
		clonevar `var'_orig = `var'
		replace `var' =`var'-`med'
}

// Reduce storage types where this loses no information, then write the
// assembled dataset to $temp (overwriting any existing file of that name)
compress
save "$temp\trade_data_h6_covars_h1.dta", replace

// End of the if `run' == 1 section
}

// Close the log opened at the top of the script
log close

